complete herbarium data

Author

Yi Liu

Published

June 5, 2024

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

1 get herbarium data

raw <- read.csv("/Volumes/seas-zhukai/phenology/phenology_leaf_flower_lag/Herbarim_flower/combined_occ_img_downloaded.csv") %>% 
  dplyr::select(day, month, year, startDayOfYear, coordinateUncertaintyInMeters, decimalLongitude, decimalLatitude, filename_image, species, genus,family)

phenology <- read.csv("/Volumes/seas-zhukai/phenology/phenology_leaf_flower_lag/Herbarim_flower/phenology.csv") %>% 
  mutate(filename_image = gsub(".txt", "", file_name))

joint_data <- left_join(phenology,raw, by = "filename_image")

1.1 fillter flowering time

joint_data_flower <- joint_data %>% #116805
  filter(flower_one > 0 & flower_many > 0) %>% 
  filter(!is.na(startDayOfYear)) %>% 
  filter(year>=1895) %>% 
  dplyr::select(decimalLongitude, decimalLatitude, startDayOfYear, year, species, genus, family) %>%
  # delete family Pinaceae and Cupressaceae
  filter(family!="Pinaceae" & family!="Cupressaceae") %>%
  rename(lon = decimalLongitude, lat = decimalLatitude, doy = startDayOfYear) %>% 
  distinct()  # clear herbarium data for repeat file and repeat file with different phenology

# two repeated component:
# 1. completely the same ~300
# 2. same specimen (different name) with different phenology (as long as the flower is consistent, we will keep them) ~2000

2 extract the climate normality

library(raster)

# extract the climate normality
complete_period_raster <- raster("../data/prism/complete_period_springmean.tif")

joint_data_flower_normality <- joint_data_flower %>%
  dplyr::select(lat, lon) %>%
  distinct() %>%
  mutate(complete_period_temp = extract(complete_period_raster, cbind(lon, lat)))

# extract the climate anormality
# Initialize an empty data frame to store the results
joint_data_flower_anormality <- data.frame()

# Loop through the specified years
for (fo_year in 1895:2023) {
  # Load the yearly raster file
  yearly_raster <- raster(paste0("../data/prism/", fo_year, "_springmean.tif"))
  
  # Process the joint_data_flower for the current year
  yearly_data <- joint_data_flower %>%
    dplyr::select(year, lat, lon) %>%
    distinct() %>%
    filter(year == fo_year) %>%
    mutate(yearly_temp = extract(yearly_raster, cbind(lon, lat)))
  
  # Append the yearly data to the cumulative data frame
  joint_data_flower_anormality <- rbind(joint_data_flower_anormality, yearly_data)
}

# Combine the normality and anormality data
temperature_data <- joint_data_flower_normality %>%
  right_join(joint_data_flower_anormality, by = c("lat", "lon")) %>%
  rename(norm = complete_period_temp, yeart = yearly_temp) %>%
  mutate(anom = yeart - norm) %>%
  right_join(joint_data_flower, by = c("lat", "lon", "year")) %>%
  filter(!is.na(anom)) 

write.csv(temperature_data, "../data/herb_temperature_data.csv")

3 fit the model

3.1 remove outlier

library(mvoutlier)
Loading required package: sgeostat
temperature_data <- read.csv("../data/herb_temperature_data.csv")

temperature_data_clean <- temperature_data %>%
  group_by(species) %>%
  filter(n() > 30) %>%
  group_modify(~ {
    data_matrix <- dplyr::select(.x, yeart, doy) %>% as.matrix()
    print(.x$species[1])
    outlier_result <- aq.plot(data_matrix)
    .x %>% mutate(outliers = outlier_result$outliers)
  }) %>%
  ungroup()
Warning: Unknown or uninitialised column: `species`.
NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL
Warning: Unknown or uninitialised column: `species`.

NULL

3.2 keep record of number of observation deleted and left

temperature_data_clean %>% 
  group_by(species) %>%
  summarise(total = n(),
            num_outliers = sum(outliers),
            delete_p = num_outliers/total) %>%
  arrange(desc(delete_p))
# A tibble: 74 × 4
   species               total num_outliers delete_p
   <chr>                 <int>        <int>    <dbl>
 1 Quercus sadleriana       61           20    0.328
 2 Betula lenta            188           58    0.309
 3 Acer tataricum           59           17    0.288
 4 Betula papyrifera       190           54    0.284
 5 Fraxinus americana      181           50    0.276
 6 Betula alleghaniensis   145           35    0.241
 7 Betula populifolia      108           24    0.222
 8 Quercus durata           61           13    0.213
 9 Quercus john-tuckeri     38            8    0.211
10 Quercus stellata         87           18    0.207
# ℹ 64 more rows

3.3 get data for model by requiring 10 observations for each parameter

temperature_data_model <- temperature_data_clean %>%
  filter(!outliers) %>%  # Correctly referencing the outliers column
  group_by(species) %>%
  filter(n_distinct(doy) > 10) %>%  # Use n_distinct() for distinct counting
  filter(n_distinct(anom) > 10) %>%  # Use n_distinct() for distinct counting
  filter(n_distinct(norm) > 10) %>%  # Use n_distinct() for distinct counting
  filter(n_distinct(doy, norm, anom) > 30) %>%  # Use n_distinct() for distinct counting
  ungroup() 

# by species colinearity
temperature_data_model %>%
  group_by(species) %>%
  summarise(cor_anom_norm = cor(anom, norm, use = "complete.obs")) %>% 
  arrange(desc(abs(cor_anom_norm)))
# A tibble: 64 × 2
   species            cor_anom_norm
   <chr>                      <dbl>
 1 Quercus parvula           -0.661
 2 Quercus lobata            -0.507
 3 Populus fremontii         -0.495
 4 Betula sandbergii         -0.353
 5 Morus rubra                0.316
 6 Ulmus alata               -0.294
 7 Fraxinus americana        -0.288
 8 Fraxinus velutina         -0.270
 9 Ulmus pumila              -0.265
10 Ulmus americana           -0.255
# ℹ 54 more rows
# by species unique value of doy, norm, anom and combination
# test <- temperature_data_model %>%
#   group_by(species) %>%
#   summarise(n_doy = n_distinct(doy),
#             n_norm = n_distinct(norm),
#             n_anom = n_distinct(anom),
#             n_comb = n_distinct(paste(doy, norm, anom))) %>%
#   arrange(desc(n_comb))

3.4 fit and plot the model

source("../scripts/function_visionalize_summmary_MLmodel.R")

# Apply the function to each species and store the results
results <- list()
unique_species <- unique(temperature_data_model$species)

for (species_name in unique_species) {
  results[[species_name]] <- analyze_species(temperature_data_model, species_name)
}

# Combine all summary rows into a single data frame
summary_results <- bind_rows(lapply(results, function(res) res$summary))

# Save all plots to a single PDF file
pdf("../data/species_plots_herb.pdf", width = 8, height = 6)
for (species_name in unique_species) {
  print(results[[species_name]]$plot)
}
dev.off()

write.csv(summary_results, "../species_summary_herb.csv", row.names = FALSE)

4 get spices composition

library(data.tree)
# Example data
species_data <- temperature_data_model %>%
  group_by(species, genus, family) %>%
  summarise(specimen_count = n())
`summarise()` has grouped output by 'species', 'genus'. You can override using
the `.groups` argument.
# Aggregate data at genus level
genus_data <- species_data %>%
  group_by(family, genus) %>%
  summarise(specimen_count = sum(specimen_count))
`summarise()` has grouped output by 'family'. You can override using the
`.groups` argument.
# Aggregate data at family level
family_data <- species_data %>%
  group_by(family) %>%
  summarise(specimen_count = sum(specimen_count))

# Total specimen count
total_specimen_count <- sum(species_data$specimen_count)

# Create hierarchical data
hierarchical_data <- species_data %>%
  mutate(level = "species") %>%
  bind_rows(genus_data %>% mutate(species = "", level = "genus")) %>%
  bind_rows(family_data %>% mutate(genus = "", species = "", level = "family")) %>%
  ungroup() %>%
  arrange(family, genus, species)

# Create a data tree structure
tree_data <- hierarchical_data %>%
  mutate(pathString = paste("Total", family, genus, species, sep = "/")) %>%
  dplyr::select(pathString, specimen_count)

tree <- as.Node(tree_data)
tree$specimen_count <- total_specimen_count

# Print the tree structure to console
print(tree, "specimen_count")
                            levelName specimen_count
1  Total                                        7336
2   ¦--Betulaceae                               1027
3   ¦   °--Betula                               1027
4   ¦       ¦--Betula alleghaniensis             110
5   ¦       ¦--Betula glandulosa                  64
6   ¦       ¦--Betula lenta                      130
7   ¦       ¦--Betula nigra                       95
8   ¦       ¦--Betula occidentalis               162
9   ¦       ¦--Betula papyrifera                 136
10  ¦       ¦--Betula populifolia                 84
11  ¦       ¦--Betula pumila                     207
12  ¦       °--Betula sandbergii                  39
13  ¦--Fagaceae                                 1845
14  ¦   °--Quercus                              1845
15  ¦       ¦--Quercus agrifolia                 207
16  ¦       ¦--Quercus alba                       75
17  ¦       ¦--Quercus berberidifolia             69
18  ¦       ¦--Quercus chrysolepis               201
19  ¦       ¦--Quercus durata                     48
20  ¦       ¦--Quercus ellipsoidalis              34
21  ¦       ¦--Quercus falcata                    41
22  ¦       ¦--Quercus gambelii                  120
23  ¦       ¦--Quercus garryana                   50
24  ¦       ¦--Quercus ilicifolia                 71
25  ¦       ¦--Quercus kelloggii                  64
26  ¦       ¦--Quercus lobata                     37
27  ¦       ¦--Quercus macrocarpa                 81
28  ¦       ¦--Quercus marilandica                67
29  ¦       ¦--Quercus nigra                      45
30  ¦       ¦--Quercus parvula                    37
31  ¦       ¦--Quercus prinoides                  33
32  ¦       ¦--Quercus rubra                      55
33  ¦       ¦--Quercus sadleriana                 41
34  ¦       ¦--Quercus stellata                   69
35  ¦       ¦--Quercus turbinella                 58
36  ¦       ¦--Quercus vacciniifolia              63
37  ¦       ¦--Quercus velutina                   52
38  ¦       ¦--Quercus virginiana                 46
39  ¦       °--Quercus wislizeni                 181
40  ¦--Moraceae                                  248
41  ¦   °--Morus                                 248
42  ¦       ¦--Morus alba                        111
43  ¦       ¦--Morus microphylla                  45
44  ¦       °--Morus rubra                        92
45  ¦--Oleaceae                                  775
46  ¦   °--Fraxinus                              775
47  ¦       ¦--Fraxinus americana                131
48  ¦       ¦--Fraxinus anomala                  123
49  ¦       ¦--Fraxinus cuspidata                 55
50  ¦       ¦--Fraxinus dipetala                 113
51  ¦       ¦--Fraxinus pennsylvanica            240
52  ¦       °--Fraxinus velutina                 113
53  ¦--Salicaceae                                807
54  ¦   °--Populus                               807
55  ¦       ¦--Populus angustifolia               75
56  ¦       ¦--Populus balsamifera                35
57  ¦       ¦--Populus deltoides                 237
58  ¦       ¦--Populus fremontii                 126
59  ¦       ¦--Populus grandidentata             101
60  ¦       °--Populus tremuloides               233
61  ¦--Sapindaceae                              2084
62  ¦   °--Acer                                 2084
63  ¦       ¦--Acer circinatum                    45
64  ¦       ¦--Acer glabrum                      172
65  ¦       ¦--Acer macrophyllum                 225
66  ¦       ¦--Acer negundo                      462
67  ¦       ¦--Acer pensylvanicum                181
68  ¦       ¦--Acer platanoides                  106
69  ¦       ¦--Acer rubrum                       465
70  ¦       ¦--Acer saccharinum                   89
71  ¦       ¦--Acer saccharum                    130
72  ¦       ¦--Acer spicatum                     167
73  ¦       °--Acer tataricum                     42
74  °--Ulmaceae                                  550
75      °--Ulmus                                 550
76          ¦--Ulmus alata                        99
77          ¦--Ulmus americana                   265
78          ¦--Ulmus pumila                       49
79          °--Ulmus rubra                       137

5 compare spatial and temporal sensitivity

summary_results <- read.csv("../species_summary_herb.csv")

summary_results_wtaxa <- temperature_data_model %>%
  distinct(species, genus, family) %>% 
  right_join(summary_results, by = "species") 

5.1 all species

5.1.1 all species plot

# plot a figure to show the comparasion of anom slope and norm lope with confidence interval and model fitting
ggplot(summary_results_wtaxa, aes(x = anom_estimate, y = norm_estimate, color = genus)) +
  geom_errorbar(aes(ymin = norm_conf_low, ymax = norm_conf_high, alpha = r_squared), width = 0) +
  geom_errorbarh(aes(xmin = anom_conf_low, xmax = anom_conf_high, alpha = r_squared), height = 0) +
  geom_point(aes(alpha = r_squared), size = 3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +  # Add 1:1 line
  scale_alpha_continuous(range = c(0.2, 1)) +  # Adjust alpha range for better visualization
  labs(
    title = "",
    x = "Temporal sensitivity (days/celsius degree)",
    y = "Spatial sensiticity (days/celsius degree)",
    alpha = "R^2",
    color = "Genus"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

5.1.2 propotion of spatial stronger than temporal

summary_results_wtaxa %>%
  mutate(spatial_stronger = ifelse(anom_estimate > norm_estimate, TRUE, FALSE)) %>%
  group_by(genus) %>%
  summarise(prop_spatial_stronger = mean(spatial_stronger, na.rm = TRUE)) %>%
  arrange(desc(prop_spatial_stronger))
# A tibble: 7 × 2
  genus    prop_spatial_stronger
  <chr>                    <dbl>
1 Betula                   1    
2 Populus                  0.833
3 Acer                     0.818
4 Ulmus                    0.75 
5 Fraxinus                 0.667
6 Morus                    0.667
7 Quercus                  0.64 

5.2 kick the r^2 less than 0.2

5.2.1 plot

summary_results_wtaxa <- summary_results_wtaxa %>% filter(r_squared>0.3)
ggplot(summary_results_wtaxa, aes(x = anom_estimate, y = norm_estimate, color = genus)) +
  geom_errorbar(aes(ymin = norm_conf_low, ymax = norm_conf_high, alpha = r_squared), width = 0) +
  geom_errorbarh(aes(xmin = anom_conf_low, xmax = anom_conf_high, alpha = r_squared), height = 0) +
  geom_point(aes(alpha = r_squared), size = 3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +  # Add 1:1 line
  scale_alpha_continuous(range = c(0.2, 1)) +  # Adjust alpha range for better visualization
  labs(
    title = "",
    x = "Temporal sensitivity (days/celsius degree)",
    y = "Spatial sensiticity (days/celsius degree)",
    alpha = "R^2",
    color = "Genus"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

5.2.2 propotion of spatial stronger than temporal

summary_results_wtaxa %>%
  mutate(spatial_stronger = ifelse(anom_estimate > norm_estimate, TRUE, FALSE)) %>%
  group_by(genus) %>%
  summarise(prop_spatial_stronger = mean(spatial_stronger, na.rm = TRUE)) %>%
  arrange(desc(prop_spatial_stronger))
# A tibble: 7 × 2
  genus    prop_spatial_stronger
  <chr>                    <dbl>
1 Betula                   1    
2 Ulmus                    1    
3 Acer                     0.778
4 Fraxinus                 0.667
5 Populus                  0.667
6 Quercus                  0.619
7 Morus                    0.5  
length(unique(summary_results_wtaxa$species))
[1] 42

5.2.3 check the not overlap species

to_check <- summary_results_wtaxa %>% 
#filter for the species that the anom slope and norm slope confidence interval do not overlap
  filter(anom_conf_low > norm_conf_high | anom_conf_high < norm_conf_low) %>%
  arrange(desc(r_squared))

print(to_check)
# A tibble: 2 × 10
  species  genus family anom_estimate anom_conf_low anom_conf_high norm_estimate
  <chr>    <chr> <chr>          <dbl>         <dbl>          <dbl>         <dbl>
1 Quercus… Quer… Fagac…        2.44           -1.04           5.92         -4.90
2 Betula … Betu… Betul…       -0.0156         -3.53           3.49         -4.63
# ℹ 3 more variables: norm_conf_low <dbl>, norm_conf_high <dbl>,
#   r_squared <dbl>